0: Preparation

Defining the input and output files

# Clean the working environment
rm(list = ls())

knitr::opts_chunk$set(echo = TRUE)

# empirical_species <- "German Shepherd"

empirical_dog_breed <- Sys.getenv("empirical_dog_breed")

empirical_species <- paste(toupper(substring(unlist(strsplit(empirical_dog_breed, "_")), 1, 1)),
                             substring(unlist(strsplit(empirical_dog_breed, "_")), 2),
                             sep = "", collapse = " ")
# empirical_species <- "Labrador Retriever"
# document_title <- paste(empirical_species,"Pipeline Results - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"55 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"40 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

N_e_bottleneck <- as.numeric(Sys.getenv("N_e_bottleneck"))
n_simulated_generations_breed_formation <- as.numeric(Sys.getenv("n_simulated_generations_breed_formation"))
reference_population_for_snp_chip <- Sys.getenv("reference_population_for_snp_chip")

document_title <- paste0(empirical_species," ",n_simulated_generations_breed_formation," Gen Breed Formation - SNP chip: ",reference_population_for_snp_chip," - ",N_e_bottleneck," N_e bottleneck scenario")

document_sub <- Sys.getenv("document_sub_title") 


# # 
# # MAF_pruning_used <- TRUE
# MAF_pruning_used <- FALSE
# 
# N_e <- 340
# # document_sub <- paste("MAF 0.05 used, but only for H_E-computation, N_e=", N_e)
# # document_sub <- paste("MAF 0.01 used, but only for H_E-computation, N_e=", N_e)
# 
# 
# 
# 
# 
# if (MAF_pruning_used == FALSE) {
#   document_sub <- paste("No MAF-based pruning used, N_e =", N_e)
# 
# 
# } else {
#   document_sub <- paste("MAF-based pruning used, N_e =", N_e)
# }

############################################ 
# Parameters used for displaying the result
############################################ 
ROH_frequency_decimals <- 1
H_e_values_decimals <- 3
F_ROH_values_decimals <- 3
Window_lengths_decimals <- 2

####################################  
# Defining the input files
#################################### 

Selection_strength_test_dir <- Sys.getenv("Selection_strength_test_dir")

Sweep_test_dir <-  Sys.getenv("Sweep_test_dir")

############### 
## Empirical ###
###############

### ROH hotspots ###
Empirical_data_ROH_hotspots_dir  <- Sys.getenv("Empirical_data_ROH_hotspots_dir")
Empirical_data_autosome_ROH_freq_dir <- Sys.getenv("Empirical_data_autosome_ROH_freq_dir")
### Inbreeding coefficient ###

Empirical_data_F_ROH_dir  <- Sys.getenv("Empirical_data_F_ROH_dir")

### Expected Heterozygosity distribution ###
Empirical_data_H_e_dir <- Sys.getenv("Empirical_data_H_e_dir")

############### 
## Simulated ###
###############

### Causative variant ###
variant_freq_plots_dir <- Sys.getenv("variant_freq_plots_dir")
variant_position_dir <- Sys.getenv("variant_position_dir")
pruned_replicates_count_dir <- Sys.getenv("pruned_replicates_count_dir")

Selection_causative_variant_windows_dir <- Sys.getenv("Selection_causative_variant_windows_dir")
causative_variant_windows_H_e_dir <- Sys.getenv("causative_variant_windows_H_e_dir")

### ROH hotspots ###
Neutral_model_ROH_hotspots_dir <- Sys.getenv("Neutral_model_ROH_hotspots_dir")
Neutral_model_autosome_ROH_freq_dir <- Sys.getenv("Neutral_model_autosome_ROH_freq_dir")

Selection_model_ROH_hotspots_dir  <- Sys.getenv("Selection_model_ROH_hotspots_dir")
Selection_model_autosome_ROH_freq_dir <- Sys.getenv("Selection_model_autosome_ROH_freq_dir")

### Inbreeding coefficient ###
Neutral_model_F_ROH_dir  <- Sys.getenv("Neutral_model_F_ROH_dir")
Selection_model_F_ROH_dir  <- Sys.getenv("Selection_model_F_ROH_dir")

### Expected Heterozygosity distribution ###
Neutral_model_H_e_dir <- Sys.getenv("Neutral_model_H_e_dir")
Selection_model_H_e_dir <- Sys.getenv("Selection_model_H_e_dir")


histogram_line_sizes <- 3
empirical_data_color <- "darkgreen"
neutral_model_color <- "blue" 
selection_model_color <- "purple"

output_dir <- Sys.getenv("Pipeline_results_output_dir") 


# Sys.getenv()  

# # Set the working directory for notebook chunks
# knitr::opts_knit$set(root.dir = output_dir_sweep_test)

Loading libraries

if (!require(knitr)) install.packages("knitr", dependencies = TRUE)
## Loading required package: knitr
## Warning: package 'knitr' was built under R version 4.3.2
library(knitr)

if (!require(ggplot2)) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
library(ggplot2)

if (!require(scatterplot3d)) install.packages("scatterplot3d", dependencies = TRUE)
## Loading required package: scatterplot3d
library(scatterplot3d) # For generating the 3d plots 

if (!require(RColorBrewer)) install.packages("RColorBrewer", dependencies = TRUE)
## Loading required package: RColorBrewer
library(RColorBrewer) # For generating a color palette

Latex formatting function

Causative variant

Variant fixation

Fixation probability

Fixation time

# Function to determine the number of rows until fixation is reached
time_to_fixation <- function(data, threshold = 0.9) {
  # Find the index of the first row where the allele frequency is 0.9 or higher
  fixation_index <- which(data$allele_frequency >= threshold)[1]
  
  # Return the number of rows until fixation is reached
  return(fixation_index - 1)
}

Summary - Variant fixation

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_variant_Fixation_summary.txt"
Selection_coefficient Fixation_probability Avg_Fixation_time Min_fixation_time Max_fixation_time
s=0.1 29.0 52 28 68
s=0.2 66.7 32 21 42
s=0.3 80.0 21 16 28
s=0.4 71.4 14 10 18
s=0.5 74.1 11 7 15
s=0.6 74.1 10 7 13
s=0.7 76.9 8 6 10
s=0.8 80.0 7 5 9

Causative variant windows

Variant position

Window lengths

Causative variant windows

Summary - Causative variant windows

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_variant_windows_summary.txt"
Selection_coefficient Avg_Length_Mb Avg_window_freq Min_freq Max_freq Avg_freq_variant_100k_window
2 s=0.2 1.60 77.4 23.4 100.0 78.4
4 s=0.4 2.12 73.8 21.0 100.0 75.0
6 s=0.6 2.62 82.6 20.0 100.0 83.3
1 s=0.1 2.67 62.4 9.9 99.8 64.0
3 s=0.3 2.72 78.6 19.8 100.0 79.6
7 s=0.7 3.17 76.6 13.7 100.0 77.1
5 s=0.5 3.59 79.3 16.1 100.0 80.5
8 s=0.8 4.78 74.8 20.2 100.0 75.6

Standard Error Confidence interval function

Confidence level <=> konfidensgrad

# Function to calculate standard error and its confidence interval
standard_error_confidence_interval_fun <- function(observed_data, confidence_level = 0.95) {
  # if ( nrow(observed_data) > 1 ) {
  
  if ( length(observed_data) > 1 ) {
      
  # Calculate standard error
  n <- length(observed_data)
  standard_deviation <- sd(observed_data)
  standard_error <- standard_deviation / sqrt(n - 1)
  
  # Calculate confidence interval based on standard error

  alpha <- (1 - confidence_level) / 2
  margin_of_error <- qnorm(1 - alpha) * standard_error
  mean_estimate <- mean(observed_data)
  # Calculate the percentiles for the confidence interval
  confidence_interval_lower_bound <- mean_estimate - margin_of_error # 2.5th percentile (2σ)
  confidence_interval_upper_bound <- mean_estimate + margin_of_error # 97.5th percentile (2σ)
  
  # Return confidence interval
  return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))

    
  } else {
    return(c(NA,NA)) 
    }
  
  
} 





# # Function to calculate bootstrap confidence intervals
# standard_error_confidence_interval_fun <- function(observed_data, n_bootstraps = 1000, confidence_level = 0.95) {
#   
#   # Function to calculate the mean for each bootstrap sample
#   compute_bootstrap_mean_fun <- function(observed_data_urn) {
#     bootstrap_dataset <- sample(observed_data_urn, replace = TRUE)
#     bootstrap_estimate <- mean(bootstrap_dataset)
#     return(bootstrap_estimate)
#   }
#   
#   # Perform bootstrap resampling
#   bootstrap_point_estimates <- replicate(n_bootstraps, compute_bootstrap_mean_fun(observed_data))
#   
#   # Calculate the percentiles for the confidence interval
#   alpha <- (1 - confidence_level) / 2
#   confidence_interval_lower_bound <- quantile(bootstrap_point_estimates, alpha) # 2.5th percentile
#   confidence_interval_upper_bound <- quantile(bootstrap_point_estimates, 1 - alpha) # 97.5th percentile
#   
#   # Return the confidence interval
#   return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
# }

1: ROH-Frequency

1.1 : Autosome ROH-frequencies

1.1.1 : Empirical - ROH frequency

1.1.2: Neutral Model - ROH frequency

1.1.3: Selection Model

1.1.4 Summary - ROH-hotspot threshold

## ROH-hotspot selection testing results://n
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/ROH-hotspot_threshold_comparison.txt"
Model Avg_ROH_hotspot_threshold
Neutral 48.9
Empirical 55.4
s=0.1 69.0
s=0.4 79.3
s=0.2 81.5
s=0.7 84.1
s=0.3 84.5
s=0.5 85.0
s=0.8 86.4
s=0.6 90.8

1.2 ROH-hotspots - ROH Frequency and Length

2: Inbreeding coefficient

2.1 Empirical data

## Overall Average Avg_F_ROH for  labrador_retriever : 0.2333818 //n

2.2 Neutral Model

## Point estimate F_ROH across all 20 simulations: 0.2254822 //n
## [1] "Bootstrap 95% Confidence Interval: [0.214418535059809, 0.23654578301248]"

2.3 Selection Model

## Point estimate F_ROH across all 20 simulations for  selection_model_s01_chr3 : 0.2549245 //n[1] "Bootstrap 95% Confidence Interval: [0.237163066303766, 0.272685952973342]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s02_chr3 : 0.2652208 //n[1] "Bootstrap 95% Confidence Interval: [0.244193292022904, 0.286248336892759]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s03_chr3 : 0.2879067 //n[1] "Bootstrap 95% Confidence Interval: [0.271554015829798, 0.304259398628033]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s04_chr3 : 0.3095266 //n[1] "Bootstrap 95% Confidence Interval: [0.287843760718632, 0.331209388678958]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s05_chr3 : 0.314374 //n[1] "Bootstrap 95% Confidence Interval: [0.284803605833935, 0.3439443025998]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s06_chr3 : 0.3708324 //n[1] "Bootstrap 95% Confidence Interval: [0.342645050025221, 0.399019757203695]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s07_chr3 : 0.3490526 //n[1] "Bootstrap 95% Confidence Interval: [0.308572669680324, 0.389532486946182]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s08_chr3 : 0.3606947 //n[1] "Bootstrap 95% Confidence Interval: [0.323633756212886, 0.39775560041362]"

2.4 F_ROH summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/F_ROH_comparison.txt"
Model F_ROH Lower_CI Upper_CI
Neutral 0.225 0.214 0.237
Empirical 0.233 NA NA
s=0.1 0.255 0.237 0.273
s=0.2 0.265 0.244 0.286
s=0.3 0.288 0.272 0.304
s=0.4 0.310 0.288 0.331
s=0.5 0.314 0.285 0.344
s=0.7 0.349 0.309 0.390
s=0.8 0.361 0.324 0.398
s=0.6 0.371 0.343 0.399

3: Expected Heterozygosity

3.1 Empirical data

3.2 Neutral Model

3.3 Selection Model

3.4 Causative Variant Window

## Point estimate H_e across all 20 simulations for  s01_chr3 : 0.2617309 //n[1] "Bootstrap 95% Confidence Interval: [0.201450985467319, 0.322010819375992]"
## Point estimate H_e across all 20 simulations for  s02_chr3 : 0.2507339 //n[1] "Bootstrap 95% Confidence Interval: [0.189069438282009, 0.31239837037654]"
## Point estimate H_e across all 20 simulations for  s03_chr3 : 0.2486648 //n[1] "Bootstrap 95% Confidence Interval: [0.179770777695006, 0.317558776389441]"
## Point estimate H_e across all 20 simulations for  s04_chr3 : 0.3106353 //n[1] "Bootstrap 95% Confidence Interval: [0.236423398446369, 0.384847266726958]"
## Point estimate H_e across all 20 simulations for  s05_chr3 : 0.2610883 //n[1] "Bootstrap 95% Confidence Interval: [0.193704426731864, 0.328472141469573]"
## Point estimate H_e across all 20 simulations for  s06_chr3 : 0.3741106 //n[1] "Bootstrap 95% Confidence Interval: [0.298053975916736, 0.450167163763232]"
## Point estimate H_e across all 20 simulations for  s07_chr3 : 0.3782401 //n[1] "Bootstrap 95% Confidence Interval: [0.343893467774569, 0.412586726443371]"
## Point estimate H_e across all 20 simulations for  s08_chr3 : 0.30029 //n[1] "Bootstrap 95% Confidence Interval: [0.224036138921915, 0.376543942403225]"

3.4 Genomewide Average H_e

Model H_e Lower_CI Upper_CI
s=0.6 0.343 0.336 0.350
s=0.8 0.346 0.339 0.354
s=0.5 0.352 0.346 0.359
s=0.7 0.353 0.344 0.361
Empirical 0.355 NA NA
s=0.2 0.356 0.352 0.361
s=0.3 0.357 0.353 0.361
s=0.4 0.357 0.350 0.363
s=0.1 0.360 0.356 0.365
Neutral 0.360 0.357 0.363

3.5 Genomewide 5th percentile comparison - Expected Heterozygosity Summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Expected_Heterozygosity_Summary.txt"
Model H_e_5th_percentile Lower_CI Upper_CI
s=0.6 0.169 0.156 0.182
s=0.8 0.173 0.159 0.187
s=0.7 0.180 0.166 0.193
s=0.5 0.186 0.173 0.200
s=0.4 0.188 0.178 0.199
s=0.3 0.196 0.186 0.205
s=0.2 0.203 0.194 0.213
Empirical 0.209 NA NA
s=0.1 0.210 0.202 0.218
Neutral 0.220 0.215 0.226

4: Summary

4.0 General comparison

4.0.1 ROH-hotspot threshold comparison

## 
##  ROH-hotspot threshold comparison between the different datasets
Model Avg_ROH_hotspot_threshold
Neutral 48.9
Empirical 55.4
s=0.1 69.0
s=0.4 79.3
s=0.2 81.5
s=0.7 84.1
s=0.3 84.5
s=0.5 85.0
s=0.8 86.4
s=0.6 90.8

4.0.2 F_ROH comparison

## Models where empirical F_ROH is within CI:
## [1] "Neutral"
## 
## Models where empirical F_ROH is outside CI:
## [1] "s=0.1" "s=0.2" "s=0.3" "s=0.4" "s=0.5" "s=0.6" "s=0.7" "s=0.8"
## png 
##   2
Model F_ROH Lower_CI Upper_CI
Neutral 0.225 0.214 0.237
Empirical 0.233 NA NA
s=0.1 0.255 0.237 0.273
s=0.2 0.265 0.244 0.286
s=0.3 0.288 0.272 0.304
s=0.4 0.310 0.288 0.331
s=0.5 0.314 0.285 0.344
s=0.7 0.349 0.309 0.390
s=0.8 0.361 0.324 0.398
s=0.6 0.371 0.343 0.399

4.1: Hotspot comparison

4.1.1: Selection test (Sweep test)

## [1] "Selection test results"
## [1] "ROH-hotspot windows with an mean H_e Value lower or equal to the lower confidence interval of the fifth percentile of the neutral model are classified as being under selection"
## [1] "5th percentile of the neutral model is: 0.2145148"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.124
Hotspot_chr28_window_1 Yes 0.197
Hotspot_chr24_window_1 No 0.217
Hotspot_chr13_window_1 No 0.225
Hotspot_chr6_window_1 No 0.263
Hotspot_chr15_window_1 No 0.268
Hotspot_chr11_window_2 No 0.284
Hotspot_chr11_window_1 No 0.289
Hotspot_chr8_window_1 No 0.369
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/ROH_hotspots_Selection_testing_neutral_model_H_E_threshold_0.215.csv.txt"
## [1] "ROH-hotspots under selection:"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.124
Hotspot_chr28_window_1 Yes 0.197

4.1.2: Selection Strength Test (Sweep test)

4.1.1.1 Extracting Causative windows under selection

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_windows_under_selection.txt"
Model H_e Lower_CI Upper_CI Under_Selection
Neutral 0.220 0.215 0.226 No
s=0.3 0.249 0.180 0.318 No
s=0.2 0.251 0.189 0.312 No
s=0.5 0.261 0.194 0.328 No
s=0.1 0.262 0.201 0.322 No
s=0.8 0.300 0.224 0.377 No
s=0.4 0.311 0.236 0.385 No
s=0.6 0.374 0.298 0.450 No
s=0.7 0.378 0.344 0.413 No

4.1.1.1 Extracting Hotspots under selection

*** Behöver bootstrap av 5th percentiles från neutral model

Hotspot - Causative Window - Comparison

Model Lengths_Mb ROH_Freq H_e Under_selection
s=0.8 4.78 74.8 0.300 No
s=0.7 3.17 76.6 0.378 No
s=0.6 2.62 82.6 0.374 No
s=0.5 3.59 79.3 0.261 No
s=0.4 2.12 73.8 0.311 No
s=0.3 2.72 78.6 0.249 No
s=0.2 1.60 77.4 0.251 No
s=0.1 2.67 62.4 0.262 No
Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")


# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot displayal
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)



# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)


# # Create the 3D scatter plot
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, # Solid circle
#   xlab = "Lengths",
#   ylab = "ROH Frequency",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# text(s3d.coords$x, s3d.coords$y, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
#      pos = 3, cex = 0.5) # pos=3 means above


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Avg ROH-frequency (%)",
  ylab = "Length (Mb)",
  zlab = "Avg H_e",
  main = plot_title
)

# )

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# Test med skuggor

setwd(output_dir)

# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot display
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

# Generate a color palette for the hotspots
hotspot_models <- unique(Hotspots_and_Causative_windows_comparison_sorted$Model[Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot"])
num_hotspots <- length(hotspot_models)

# Choose the color palette based on the number of hotspots
if (num_hotspots == 2) {
  color_palette_name <- "Set2"
} else {
  color_palette_name <- "Set3"
}

# Get the colors for the hotspots
hotspot_colors <- setNames(brewer.pal(n = num_hotspots, name = color_palette_name), hotspot_models)
## Warning in brewer.pal(n = num_hotspots, name = color_palette_name): minimal value for n is 3, returning requested palette with 3 different levels
# Assign colors to each point
Hotspots_and_Causative_windows_comparison_sorted$Color <- ifelse(
  Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", 
  hotspot_colors[Hotspots_and_Causative_windows_comparison_sorted$Model], 
  "blue"
)

x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
y_axis_label <- "Length (Mb)"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
z_axis_label <- "Avg H_e"


# x_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# x_axis_label <- "Length (Mb)"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# y_axis_label <- "Avg H_e"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# z_axis_label <- "Avg ROH-frequency (%)"


x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
y_axis_label <- "Avg H_e"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
z_axis_label <- "Length (Mb)"

# x_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# x_axis_label <- "Avg H_e"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# y_axis_label <- "Avg ROH-frequency (%)"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# z_axis_label <- "Length (Mb)"

# # Create and save the 3D scatter plot as a PNG file
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080, res = 300)
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png",width = 800, height = 600, res = 300)


# Create the 3D scatter plot
s3d <- scatterplot3d(
  x_value  ,
  y_value,
  z_value,
  color = Hotspots_and_Causative_windows_comparison_sorted$Color,
  pch = 19, # Solid circle
  xlab = x_axis_label,
  ylab = y_axis_label,
  zlab = z_axis_label,
  main = plot_title
)

# Add shadows on the x-y plane (z = 0)
s3d$points3d(
  x_value,
  y_value,
  rep(0, nrow(Hotspots_and_Causative_windows_comparison_sorted)),  # Set z to 0 for shadow
  col = adjustcolor(Hotspots_and_Causative_windows_comparison_sorted$Color, alpha.f = 0.3), # Semi-transparent shadows
  pch = 19
)

# Convert coordinates for adding labels
s3d.coords <- s3d$xyz.convert(
  x_value,
  y_value,
  z_value
)

# Add labels to the original points
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# # Close the graphics device
# dev.off()

Fixar överlappande labels

# # Increase plot size
# par(mar = c(5, 5, 5, 5)) # Adjust margins if needed
# 
# s3d <- scatterplot3d(
#   # x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb ,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   z = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb  ,
# 
#   
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, 
#   # xlab = "Avg ROH-frequency (%)",
#   # ylab = "Length (Mb)",
#   # zlab = "H_e",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "H_e",
#   zlab = "Length (Mb)",
# 
# 
#   
#   main = plot_title
# )
# # # Add labels to the points
# # s3d.coords <- s3d$xyz.convert(
# #   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
# #   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
# #   Hotspots_and_Causative_windows_comparison_sorted$H_e
# # )
# 
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# )
# 
# # 
# # # Add jitter to the label positions
# # text(s3d.coords$x + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      s3d.coords$y + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
# #      pos = 3, cex = 0.5)
# # 
# # 
# text(s3d.coords$x, s3d.coords$y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
#      pos = 3, cex = 0.5) # pos=3 means above

jittering on all axis

# setwd(output_dir)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # windows(width = 1920 / 96, height = 1080 / 96)  # 96 DPI is default
# # # # Create and save the 3D scatter plot as a PNG file
# # png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080)
# 
# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# # Subset data for non-selection points
# non_selection_data <- Hotspots_and_Causative_windows_comparison_sorted[non_selection_indices, ]
# # s3d.coords$x[non_selection_indices]
# # 7.888889 6.278889 6.715556 7.655556 6.674444 5.723333 6.143333 5.773333 6.212222
# # custom_jitter_x_axis_hotspots <- c(0,0,-0.05,0,0.1,-0.4,0,0,0)  # Adjust as needed
# custom_jitter_x_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)  # Adjust as needed
# 
# # s3d.coords$y[non_selection_indices]
# #3.7711111 2.4511111 1.9444444 1.5644444 2.5355556 3.1066667 2.6066667 3.1466667 0.8977778
# # custom_jitter_y_axis_hotspots <- c(0,-0.7,-0.65,0,0,-0.3,0,0.1,0)   # Adjust as needed
# custom_jitter_y_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)   # Adjust as needed
# 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices]+ custom_jitter_x_axis_hotspots, 
#      s3d.coords$y[non_selection_indices]+ custom_jitter_y_axis_hotspots, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices], 
#      pos = 3, cex = 0.7)
# 
# 
# 
# selection_data <- Hotspots_and_Causative_windows_comparison_sorted[selection_indices, ]
# custom_jitter_x_axis_sel_coeff <- c(0.2,0.05,0.025,0,-0.05,0,0)
# # custom_jitter_x_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# custom_jitter_y_axis_sel_coeff <- c(-0.25,0,-0.57,-0.1,-0.6,0,0)
# # custom_jitter_y_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# # Apply jitter to selection point labels
# label_jitter <- runif(sum(selection_indices), 0, 0.2) # Small random jitter
# # Add labels for selection points with jitter
# text(s3d.coords$x[selection_indices] + custom_jitter_x_axis_sel_coeff, 
#      s3d.coords$y[selection_indices] + custom_jitter_y_axis_sel_coeff, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices], 
#      pos = 3, cex = 0.7)

# # Close the graphics device
# dev.off()

# kable(selection_data,row.names = FALSE)
# kable(non_selection_data,row.names = FALSE)

#Jitter z-axis

# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Generate small random jitter for the y-axis (simulating z-axis movement)
# y_jitter <- runif(sum(selection_indices), -0.02, 0.02)
# 
# # Adjust y-coordinates for selection labels only
# jittered_y <- s3d.coords$y[selection_indices] + y_jitter
# 
# # Add labels for selection points with y-axis adjustment
# text(s3d.coords$x[selection_indices],
#      jittered_y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#      pos = 3, cex = 0.7)
# # 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices],
#      s3d.coords$y[non_selection_indices],
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#      pos = 3, cex = 0.7)
# Sort the data frame based on average fixation time, in descending order
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$ROH_Freq), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
8 s=0.6 2.62 82.6 0.374 No
7 s=0.5 3.59 79.3 0.261 No
5 s=0.3 2.72 78.6 0.249 No
4 s=0.2 1.60 77.4 0.251 No
9 s=0.7 3.17 76.6 0.378 No
2 Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
10 s=0.8 4.78 74.8 0.300 No
6 s=0.4 2.12 73.8 0.311 No
3 s=0.1 2.67 62.4 0.262 No
1 Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$H_e), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
9 s=0.7 3.17 76.6 0.378 No
8 s=0.6 2.62 82.6 0.374 No
6 s=0.4 2.12 73.8 0.311 No
10 s=0.8 4.78 74.8 0.300 No
3 s=0.1 2.67 62.4 0.262 No
7 s=0.5 3.59 79.3 0.261 No
4 s=0.2 1.60 77.4 0.251 No
5 s=0.3 2.72 78.6 0.249 No
2 Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
1 Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
# # Load the required packages
# library(rgl)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # Open a new 3D plotting device
# open3d()
# 
# # Plot with rgl
# plot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   col = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   type = 's', # Use points
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d_coords <- cbind(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Add labels for non-selection points with custom jitter
# text3d(
#   s3d_coords[non_selection_indices, ] + cbind(custom_jitter_x_axis_hotspots, custom_jitter_y_axis_hotspots, 0),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#   col = "red", # Color of the text
#   cex = 0.7
# )
# 
# # Add labels for selection points with custom jitter
# text3d(
#   s3d_coords[selection_indices, ] + cbind(custom_jitter_x_axis_sel_coeff, custom_jitter_y_axis_sel_coeff, label_jitter),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#   col = "blue", # Color of the text
#   cex = 0.7
# )

# # Save the 3D plot as a PNG file
# rgl.postscript("3dplot_Hotspot_Causative_Window_Comparison.png", fmt = "png")

# # Close the 3D device
# rgl.close()

Visualizing and scaling

# Min-max-scaling normalization function, that normalizes a vector of values
min_max_normalization_fun <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
### Min max scaling ###
# Normalize Lengths_Mb
Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb)
# Normalize ROH Frequency
Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq)
# Normalize H_e
Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$H_e)

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

plot_title <- paste("Normalized ROH Hotspot(s) & Causative Windows comparison")


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Normalized Lengths",
  ylab = "Normalized Mean ROH Frequency",
  zlab = "Normalized H_e",
  main = plot_title
)

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

5 Visualizing Expected Heterozygoisty

5.1 Bootstrap CI for mean 5th percentile H_e

## Models where empirical H_e is within CI for Hotspot_chr1_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr1_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr28_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr28_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr24_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr24_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr13_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr13_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr6_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr6_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr15_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr15_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr11_window_2 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_2 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr11_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr8_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr8_window_1 :
## character(0)

5.2 5th percentile of the extreme H_e values